# Import the Adult dataset from OpenML, split it into train, validation, and test sets, then fit a polynomial-features + SVC pipeline and measure evaluation time and performance.
import time
from experiments.utils import load_dataset_safely, seed_everything
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

def _build_pipeline(num_cols, cat_cols):
    """Assemble the unfitted preprocessing + SVC pipeline.

    Args:
        num_cols: Column names routed through the numeric branch
            (median imputation -> standard scaling -> degree-2 polynomial
            expansion).
        cat_cols: Column names routed through the categorical branch
            (most-frequent imputation -> ordinal encoding, with unseen
            categories mapped to -1).

    Returns:
        A ``Pipeline`` with a ``"preprocessor"`` step (``ColumnTransformer``)
        followed by an ``"svc"`` step (RBF-kernel ``SVC``).
    """
    # NOTE(review): polynomial terms are generated *after* scaling, so the
    # expanded features are not themselves standardized, and the ordinal codes
    # from the categorical branch are passed through unscaled. RBF kernels are
    # scale-sensitive -- confirm this ordering is intended.
    numeric_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        # Apply polynomial features ONLY on numeric branch to avoid blow-ups
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ])

    categorical_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, num_cols),
            ("cat", categorical_pipe, cat_cols),
        ],
        sparse_threshold=0.0,  # force dense for SVC compatibility
    )

    # max_iter caps the solver's iterations, bounding training time; the fit
    # may therefore stop before full convergence.
    clf = SVC(kernel="rbf", C=1.0, gamma="scale", max_iter=2000)

    return Pipeline([
        ("preprocessor", preprocessor),
        ("svc", clf),
    ])


def main() -> None:
    """Train the polynomial-features + SVC pipeline on "adult" and report metrics.

    Loads the pre-split dataset through ``load_dataset_safely``, fits the
    pipeline on the training split while timing the fit, then prints the
    training time plus accuracy and F1 on the validation and test splits.

    Raises:
        RuntimeError: If ``load_dataset_safely`` returns ``None`` for the
            dataset; the helper's message is used as the error text.
    """
    seed_everything(42)

    # 1) Load dataset via the project helper (per the original author's note it
    #    handles '?' markers and encodes y -- not verifiable from this file).
    ds, msg = load_dataset_safely("adult")
    if ds is None:
        raise RuntimeError(msg)
    print(msg)

    X_train, X_val, X_test = ds["X_train"], ds["X_val"], ds["X_test"]
    y_train, y_val, y_test = ds["y_train"], ds["y_val"], ds["y_test"]

    # 2) Identify numeric/categorical columns from the training split's dtypes
    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
    cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    # 3) + 4) Build preprocessors and estimator
    pipeline = _build_pipeline(num_cols, cat_cols)

    # 5) Train & measure time. perf_counter() is monotonic and high-resolution,
    #    so the interval is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    pipeline.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    # 6) Evaluate
    y_val_pred = pipeline.predict(X_val)
    y_test_pred = pipeline.predict(X_test)

    # f1_score is called with its default averaging -- assumes binary labels;
    # TODO confirm against the helper's label encoding.
    val_acc = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    print(f"Training Time: {train_time:.2f} seconds")
    print(f"Validation Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")


if __name__ == "__main__":
    main()
